Help! Why does the code always read the last column of my dataset and throw an error
from collections import Counter import numpy as np import pandas as pd import pickle
from sklearn.model_selection import train_test_split from sklearn import svm, neighbors from sklearn.ensemble import VotingClassifier, RandomForestClassifier
# calculate the percentage change in labels to normlaize the features
# Create new columns to hold the values of the percentage change of that day for i in range(1, hm_days+1): df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
df.fillna(0, inplace=True) return tickers, df
# process_data_for_labels('XOM')
def buy_sell_hold(*args): # Breaks down the series into an array of its values cols = [c for c in args] requirement = 0.025 for col in cols: if col > requirement: return 1 if col < -requirement: return -1 return 0
def extract_featuresets(ticker): # returns ticker and new data frame with forward data joined tickers, df = process_data_for_labels(ticker)
# Apply buy_sell_hold to each series that has been broken down to an array by the for loop # The multiple parameters of list will be taken as a sum and consolidated as a single list df['{}_target'.format(ticker)] = list(map(buy_sell_hold, df['{}_1d'.format(ticker)], df['{}_2d'.format(ticker)], df['{}_3d'.format(ticker)], df['{}_4d'.format(ticker)], df['{}_5d'.format(ticker)], df['{}_6d'.format(ticker)], df['{}_7d'.format(ticker)]))
print df.columns
vals = df['{}_target'.format(ticker)].values.tolist() str_vals = [str(i) for i in vals] print 'Data spread:', Counter(str_vals)
df.fillna(0, inplace=True)
# replace infinity values with nan df = df.replace([np.inf, -np.inf], np.nan) df.dropna(inplace=True)
# Returns a dataframe of the percentage change from yesterday (price today - price yesterday) df_vals = df[[ticker for ticker in tickers]].pct_change() df_vals = df_vals.replace([np.inf, -np.inf], 0) df_vals.fillna(0, inplace=True)
X = df_vals.values y = df['{}_target'.format(ticker)].values
return X,y,df
def do_ml(ticker): X, y, df = extract_featuresets(ticker)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
clf.fit(X_train, y_train) confidence = clf.score(X_test, y_test) print 'Accuracy', confidence predictions = clf.predict(X_test) # The spread is to see whether our predictions are skewed because the model thinks that a certain result is alot more accuarate print 'Predicted spread: ', Counter(predictions)
return confidence
do_ml('AAPL')
My csv file is a table with multiple columns that end at the stock ticker name BBBY.
And this is the output and error I keep getting when I call do_ml('AAPL')
Traceback (most recent call last): File "finance_12.py", line 68, in <module> extract_featuresets('AAPL') File "finance_12.py", line 64, in extract_featuresets y = df['{}_target'.format(ticker)].values File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2059, in __getitem__ return self._getitem_column(key) File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2066, in _getitem_column return self._get_item_cache(key) File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/generic.py", line 1386, in _get_item_cache values = self._data.get(item) File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/internals.py", line 3543, in get loc = self.items.get_loc(item) File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/indexes/base.py", line 2136, in get_loc return self._engine.get_loc(self._maybe_cast_indexer(key)) File "pandas/index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas/index.c:4433) File "pandas/index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:4279) File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742) File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696) KeyError: 'BBBY_target'
You must be logged in to post. Please login or register an account.
Ok I solved it. We shouldn't be naming the ticker variable as ticker in df_vals = df[[ticker for ticker in tickers]].pct_change() as it will replace the ticker parameter passed from the function as the last ticker name in your csv file
-Harris 7 years ago
You must be logged in to post. Please login or register an account.
can u please post the changes in code to execute thanks
-krishnagutta 7 years ago
You must be logged in to post. Please login or register an account.